import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
%matplotlib inline
sns.set()
# Load the raw deals dataset and take a first look at the rows and dtypes.
sales_data = pd.read_csv('data/sales_data.csv')
sales_data.head()
sales_data.info()
# Distinct product names present in the data
sales_data['Product'].unique()
# Distinct sellers present in the data
sales_data['Seller'].unique()
# Seller x Product deal counts, shaded by frequency (styled notebook display)
pd.crosstab( sales_data.Seller,sales_data.Product).T.style.background_gradient(cmap='summer_r')
# How each deal-start channel splits into Won / Lost outcomes
pd.crosstab(sales_data.Strat_deal, sales_data.Status).T.style.background_gradient(cmap='summer_r')
# Won/Lost counts per deal-start channel
sns.catplot(x="Strat_deal", hue="Status", data=sales_data,kind="count", palette="muted")
plt.xlabel('Start Deal')
plt.ylabel('Count')
plt.title('Start deal result based on Won|Loss')
# Won/Lost counts per client (horizontal bars)
sns.catplot(y="Client", hue="Status", data=sales_data,
kind="count", palette="muted")
# Won/Lost counts per lead source
sns.catplot(y="Source", hue="Status", data=sales_data,
kind="count", palette="muted")
plt.xticks(rotation='vertical')
# Won/Lost counts per "Att_t_client" category
# NOTE(review): column meaning inferred from its name only — confirm with data owner.
sns.catplot(y="Att_t_client", hue="Status", data=sales_data,
kind="count", palette="muted")
plt.xticks(rotation='vertical')
def plot_of_count_and_pie(column_name, data=None):
    """Draw a frequency bar plot and an exploded pie chart for one categorical column.

    Parameters
    ----------
    column_name : str
        Name of the categorical column to summarise.
    data : pandas.DataFrame, optional
        Frame to plot from. Defaults to the module-level ``sales_data``
        (backward compatible with the original single-argument call).
    """
    if data is None:
        data = sales_data
    fig = plt.figure(figsize=(45, 20))
    gs = gridspec.GridSpec(nrows=1, ncols=2, width_ratios=[2, 1])

    # Left panel: category counts, ordered from most to least common.
    fig.add_subplot(gs[0, 0])
    counts = data[column_name].value_counts()
    sns.countplot(x=column_name, data=data, order=counts.index)
    plt.xlabel(column_name, fontsize=42)
    plt.ylabel('Count', fontsize=42)
    plt.xticks(rotation='vertical', fontsize=32)
    plt.yticks(fontsize=32)

    # Right panel: percentage shares. The explode offsets grow by 0.060 per
    # slice and are sized from value_counts() — NOT unique(), which also
    # counts NaN and could produce one offset too many for the pie slices.
    explode = [0.060 * (i + 1) for i in range(len(counts))]
    fig.add_subplot(gs[0, 1])
    counts.plot.pie(autopct='%1.1f%%', shadow=True,
                    textprops={'fontsize': 24}, explode=explode)
    plt.xlabel(column_name, fontsize=42)
    plt.ylabel('Count', fontsize=42)
    plt.xticks(fontsize=32)
    plt.tight_layout()
    plt.show()
Fonksiyon bize bar chart ve pie chart döndürecek ve böylece bütün sütunların sayısal ve yüzdesel olarak dağılımları görülebilir.
%matplotlib inline
# Render a bar + pie summary for every column listed in `column_list`.
# NOTE(review): `column_list` is never defined in this file — presumably a list
# of categorical column names to visualise; define it before running this loop.
for i in column_list:
plot_of_count_and_pie(i)
# Target: the Won/Lost outcome column.
y = sales_data['Status']
# Features: every remaining column.
x= sales_data.drop('Status', axis=1)
# All feature columns are categorical, so cast them to the pandas 'category' dtype.
x = x.astype('category')
x.info()
# Encode the label as 1 (Won) / 0 (Lost); raises KeyError on any other value.
won_loss = {'Won':1, 'Lost':0}
y = [won_loss[item] for item in y]
# One-hot encode the categorical features (dummy variables).
X = pd.get_dummies(x)
X.head()
# Model-building imports: train/validation split, a set of baseline
# classifiers, and accuracy / confusion-matrix metrics.
# (The original imported DecisionTreeClassifier twice; the duplicate is removed.)
from sklearn.model_selection import train_test_split  # training and testing data split
from sklearn.linear_model import LogisticRegression   # logistic regression
from sklearn import metrics                           # accuracy measure
from sklearn import svm                               # support vector machine
from sklearn.tree import DecisionTreeClassifier       # decision tree
from sklearn.ensemble import RandomForestClassifier   # random forest
from sklearn.neighbors import KNeighborsClassifier    # KNN
from sklearn.naive_bayes import GaussianNB            # naive Bayes
from sklearn.metrics import confusion_matrix          # confusion matrix

# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=0)
# Fit each baseline model on the training split and report hold-out accuracy.
# NOTE(review): metrics.accuracy_score is called as (y_pred, y_true); accuracy
# is symmetric so the value is unaffected, but the conventional argument order
# is (y_true, y_pred).
model_log = LogisticRegression(solver='liblinear')
model_log.fit(X_train, y_train)
prediction_log = model_log.predict(X_valid)
print('The accuracy of the Logistic Regression is',metrics.accuracy_score(prediction_log, y_valid))
# Linear-kernel SVM
model_svm_l = svm.SVC(kernel='linear', C=0.1, gamma=0.1)
model_svm_l.fit(X_train, y_train)
prediction_svm_l = model_svm_l.predict(X_valid)
print('The accuracy of the Linear Support Vector Machine is ', metrics.accuracy_score(prediction_svm_l, y_valid))
# RBF-kernel SVM
model_rbf = svm.SVC(kernel='rbf', C=0.1, gamma=0.1)
model_rbf.fit(X_train, y_train)
prediction_rbf = model_rbf.predict(X_valid)
print('The accuracy of the Radical Support Vector Machine is ', metrics.accuracy_score(prediction_rbf, y_valid))
# Decision tree
model_tree = DecisionTreeClassifier()
model_tree.fit(X_train, y_train)
prediction_tree = model_tree.predict(X_valid)
print('The accuracy of the Decision Tree is ', metrics.accuracy_score(prediction_tree, y_valid))
Cross validation yöntemi ile elde edilen skorun doğruluğunun rastgele olmaması ve belli bir veriye bağlı olup olmadığı kontrol edilmektedir. Burada veri 10 eşit parçaya ayrıldı ve her bir parça iterasyonla test verisi olarak kullanıldı. Burada farklı modeller için sonuçlar aşağıda listelendi.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier
# Compare the baseline classifiers with 10-fold cross validation so the
# scores do not hinge on one particular train/test split.
# shuffle=True is required when random_state is given: modern scikit-learn
# raises ValueError for KFold(random_state=...) without shuffling.
kfold = KFold(n_splits=10, shuffle=True, random_state=22)
mean_scores = []   # mean CV accuracy per model
std_scores = []    # std-dev of the 10 fold scores per model
fold_scores = []   # raw per-fold scores, kept for the box plot below
classifiers = ['Linear Svm', 'Radial Svm', 'Logistic Regression', 'KNN', 'Decision Tree', 'Naive Bayes', 'Random Forest']
models = [svm.SVC(kernel='linear'), svm.SVC(kernel='rbf'),
          LogisticRegression(solver='liblinear'), KNeighborsClassifier(n_neighbors=9),
          DecisionTreeClassifier(), GaussianNB(), RandomForestClassifier(n_estimators=100)]
for model in models:
    cv_result = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
    mean_scores.append(cv_result.mean())
    std_scores.append(cv_result.std())
    fold_scores.append(cv_result)
new_models_data_frame = pd.DataFrame({'CV Mean': mean_scores, 'Std': std_scores}, index=classifiers)
new_models_data_frame
plt.subplots(figsize=(12, 6))
plt.xticks(rotation=45)
# Pass the fold scores as a tidy frame: seaborn >= 0.12 no longer accepts
# two bare positional vectors in sns.boxplot(x, y).
sns.boxplot(data=pd.DataFrame(dict(zip(classifiers, fold_scores))))
# One cross-validated confusion matrix per baseline model, drawn on a 3x3 grid.
# The list order fixes the grid position: index i lands at ax[i // 3, i % 3],
# matching the original hand-written layout exactly.
f, ax = plt.subplots(3, 3, figsize=(12, 10))
cm_models = [
    ('Linear SVM', svm.SVC(kernel='linear')),
    ('Radical SVM', svm.SVC(kernel='rbf')),
    ('KNN', KNeighborsClassifier(n_neighbors=9)),
    ('Logistic Regression', LogisticRegression(solver='liblinear')),
    ('Random Forest', RandomForestClassifier(n_estimators=100)),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Naive Bayes', GaussianNB()),
]
for i, (title, estimator) in enumerate(cm_models):
    axis = ax[i // 3, i % 3]
    # Out-of-fold predictions for every row, so the matrix covers all the data.
    y_pred = cross_val_predict(estimator, X, y, cv=10)
    sns.heatmap(confusion_matrix(y, y_pred), ax=axis, annot=True, fmt='2.0f')
    axis.set_title(title)
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()
from sklearn.model_selection import GridSearchCV

# Exhaustive search over kernel / C / gamma for the SVC.
C = [0.05, 0.1, 0.2, 0.25, 0.3, 0.35, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
gamma = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
kernel = ['rbf', 'linear']
hyper = {'kernel': kernel, 'C': C, 'gamma': gamma}
gd = GridSearchCV(estimator=svm.SVC(), param_grid=hyper, verbose=True)
gd.fit(X, y)
print(gd.best_score_)
print(gd.best_estimator_)
# Evaluate on the hold-out split with the parameters the search actually found.
# (The original refit a hard-coded C=0.1, gamma=0.1, silently discarding the
# grid-search result.)
model_rbf = svm.SVC(**gd.best_params_)
model_rbf.fit(X_train, y_train)
prediction_rbf = model_rbf.predict(X_valid)
print('The accuracy of the Radical Support Vector Machine is ', metrics.accuracy_score(prediction_rbf, y_valid))
# Grid-search the forest size (50..950 in steps of 50) for the random forest.
n_estimator =range(50, 1000, 50)
hyper = {'n_estimators': n_estimator}
gd = GridSearchCV(estimator=RandomForestClassifier(random_state=0), param_grid=hyper, verbose=True)
gd.fit(X,y)
print(gd.best_score_)
print(gd.best_estimator_)
# Bagging ensemble of 800 bootstrapped 3-NN classifiers.
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn 1.2
# and removed in 1.4 — update the keyword when running on a newer version.
from sklearn.ensemble import BaggingClassifier
model_bag = BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=3), random_state=0, n_estimators=800)
model_bag.fit(X_train, y_train)
prediction_bag = model_bag.predict(X_valid)
print('The accuracy for bagged KNN is:',metrics.accuracy_score(prediction_bag,y_valid))
# 10-fold CV score for a split-independent estimate.
result=cross_val_score(model_bag,X,y,cv=10,scoring='accuracy')
print('The cross validated score for bagged KNN is:',result.mean())
# Soft-voting ensemble over the baseline models: soft voting averages the
# predicted class probabilities, hence probability=True on both SVCs.
# NOTE(review): the name `ensemble` shadows the sklearn.ensemble module
# imported earlier in this file — consider renaming.
from sklearn.ensemble import VotingClassifier
ensemble = VotingClassifier(estimators=[('KNN', KNeighborsClassifier(n_neighbors=9)),
('RBF', svm.SVC(kernel='rbf',probability=True,C=0.4,gamma=0.1)),
('RFor', RandomForestClassifier(n_estimators=900, random_state=0)),
('LR', LogisticRegression(C=0.05)),
('DT', DecisionTreeClassifier(random_state=0)),
('NB', GaussianNB()),
('Svm', svm.SVC(kernel='linear',probability=True))],
voting='soft').fit(X_train, y_train)
print('The accuracy for ensembled model is:',ensemble.score(X_valid,y_valid))
# 10-fold CV of the whole ensemble for a split-independent estimate.
cross=cross_val_score(ensemble,X,y, cv = 10,scoring = "accuracy")
print('The cross validated score is',cross.mean())
Modellere göre sütun seçimleri ile ilgili çalışmalar.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xg

# Fit four tree-ensemble models on the full data and chart each one's
# feature importances side by side so column rankings can be compared.
# Index i lands at ax[i // 2, i % 2], matching the original 2x2 layout.
f, ax = plt.subplots(2, 2, figsize=(15, 45))
importance_models = [
    (RandomForestClassifier(n_estimators=500, random_state=0),
     'Feature Importance in Random Forests', {'cmap': 'Set3'}),
    (AdaBoostClassifier(n_estimators=200, learning_rate=0.05, random_state=0),
     'Feature Importance in AdaBoost', {'color': 'lightcoral'}),
    (GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, random_state=0),
     'Feature Importance in Gradient Boosting', {'color': 'lightgreen'}),
    (xg.XGBClassifier(n_estimators=900, learning_rate=0.1),
     'Feature Importance in XgBoost', {'color': 'violet'}),
]
for i, (model, title, bar_kwargs) in enumerate(importance_models):
    axis = ax[i // 2, i % 2]
    model.fit(X, y)
    # Ascending sort puts the most important feature at the top of the barh plot.
    (pd.Series(model.feature_importances_, X.columns)
       .sort_values(ascending=True)
       .plot.barh(width=0.8, ax=axis, **bar_kwargs))
    axis.set_title(title)
plt.subplots_adjust(wspace=0.5, hspace=0.5)
plt.show()